% updated 12/23/2015
% random sub-sample of the data

% model 1: Y = lambda*A*Y + rho*Y_ttl + X*b + error
%% CONTROL PANEL
%clear
parentpath = cd(cd('..'));

ssd = 20161220;
rng(ssd)

option = optimoptions(@fminunc,'Algorithm','trust-region','GradObj','on','Hessian','on','Display','notify','DerivativeCheck','off');

warning('ON','msg:id')
dataset = 'cati_merged_sdc_rnd'; % Choose from: {'cati','sdc_rnd','sdc_rnd_ctt','cati_merged_sdc_rnd','cati_merged_sdc_rnd_ctt'}
rndlink = 0; % 1 if link duration is random, 0 if link duration is fixed
lnkyear = 5; % Choose from: {3,4,5,6,7}.
sicdgit = 1; % Choose from: {1,10,100,1000}
timelag = 1; % time lag of RND stock
%sampler = 0.7; % sampling rate

fac = 1e3; % scale factor for output
rep = 500;

yr1 = 1966;
yr2 = 2006;

% drop the first t0 years (i.e. the starting year is (yr1+t0))
t0 = 1;
%%
switch dataset
    case 'cati'
        n = 11093;
    case 'sdc_rnd'
        n = 12253;
    case 'sdc_rnd_ctt'
        n = 21387;
    case 'cati_merged_sdc_rnd'
        n = 19654;
    case 'cati_merged_sdc_rnd_ctt'
        n = 27147;
    otherwise
        disp('No appropriate dataset specified.')
end
T = yr2-yr1+1;
%%
ID = zeros(n,2);
ID(:,1) = (1:n)';
for s = 1:T
    yr = yr1+s-1;
    file = [parentpath '/' dataset '/balance_sheets/' int2str(yr) '_ID SIC SALES PROFIT RND EMPL FIXED_CAPITAL TOT_COST.csv'];
    flid = fopen(file);
    dat0 = textscan(flid,'%s %s %s %s %s %s %s %s','delimiter',';','headerlines',1);
    fclose(flid);
    %---------------------------------   
    FID = str2double(strtrim(dat0{1})); % FID
    SIC = str2double(strtrim(dat0{2})); % SIC
    FID(isnan(FID)) = 0;
    SIC(isnan(SIC)) = 0;
    SIC(floor(SIC/1000)==0) = 0; % drop firms with less than 4-digit SIC
    
    for i = 1:n
        if SIC(FID==i) > 0
            if SIC(FID==i) ~= ID(i,2)
                if ID(i,2) == 0
                    ID(i,2) = SIC(FID==i);
                else
                    ID(i,2) = -1;
                    %warning('msg:id','SIC code changes over time')
                end
            end
        end
    end
end
ID(:,2) = floor(ID(:,2)/sicdgit);
ID = ID(ID(:,2)>0,:); % drop firms with missing SIC
n = size(ID,1);

file = [parentpath '/' dataset '/location/' 'ID NATION_ID NATION CITY LOC_X LOC_Y.csv'];
flid = fopen(file);
dat0 = textscan(flid,'%s %s %s %s %s %s','delimiter',';','headerlines',1);
fclose(flid);
tmp1 = str2double(strtrim(dat0{1}));
tmp2 = str2double(strtrim(dat0{2}));

loc = zeros(size(ID,1),1);
for i = 1:n
    if sum(tmp1==ID(i,1)) == 1
        loc(i) = tmp2(tmp1==ID(i,1));
    end
end
ID = ID(loc==235,:);
n = size(ID,1);

ID0 = ID;
n0 = n;
m = floor(sampler*n0);
%%
nfirm = zeros(1,rep);
b3 = zeros(3,rep);
for j = 1:rep
j
ID = ID0;
n = n0;
ridx = randperm(n); 
idx1 = ridx(1:m);
idx2 = ridx(m+1:n);
%%
dm = cell(T,1);
X1 = cell(T,1);
X2 = cell(T,1);
X3 = cell(T,1);
for s = 1:T
    yr = yr1+s-1;
    %---------------------------------
    firm = zeros(n,3);
    %---------------------------------
    file = [parentpath '/' dataset '/balance_sheets/' int2str(yr) '_ID OUTPUT.csv'];
    flid = fopen(file);
    dat1 = textscan(flid,'%s %s','delimiter',';','headerlines',1);
    fclose(flid);
    fid1 = str2double(strtrim(dat1{1}));
    outp = str2double(strtrim(dat1{2}));
    %---------------------------------
    file = [parentpath '/' dataset '/balance_sheets/' int2str(yr-timelag) '_ID RND_STOCK.csv'];
    flid = fopen(file);
    dat2 = textscan(flid,'%s %s','delimiter',';','headerlines',1);
    fclose(flid);
    fid2 = str2double(strtrim(dat2{1}));
    stck = str2double(strtrim(dat2{2}));
    %---------------------------------
    for i = 1:n
        if (sum(fid1==ID(i,1))==1)&&(sum(fid2==ID(i,1))==1)
            firm(i,1) = outp(fid1==ID(i,1))/fac;
            firm(i,2) = stck(fid2==ID(i,1));
            firm(i,3) = ID(i,2);
        end
    end
    firm(isnan(firm)) = 0;
    %---------------------------------
    file = [parentpath '/' dataset '/tot_output_compustat/' int2str(yr) '_Compustat_US_deflated_TOT_OUTPUT.csv'];
    flid = fopen(file);
    dta1 = textscan(flid,'%s %s','delimiter',';','headerlines',1);
    fclose(flid);
    sec1 = [str2double(strtrim(dta1{1})),str2double(strtrim(dta1{2}))/fac];
    sec1(isnan(sec1)) = 0;
    sec1(:,1) = floor(sec1(:,1)/sicdgit);
    %---------------------------------
    file = [parentpath '/' dataset '/tot_rnd_stock/' int2str(yr-timelag) '_Compustat_US_TOT_RND_STOCK.csv'];
    flid = fopen(file);
    dta2 = textscan(flid,'%s %s','delimiter',';','headerlines',1);
    fclose(flid);
    sec2 = [str2double(strtrim(dta2{1})),str2double(strtrim(dta2{2}))];
    sec2(isnan(sec2)) = 0;
    sec2(:,1) = floor(sec2(:,1)/sicdgit);
    %---------------------------------
    % add the sector-level data to the firm-level data
    aggr = zeros(n,2);
    drop = firm(idx2,:);
    for i = 1:n
        aggr(i,1) = sum(sec1(sec1(:,1)==ID(i,2),2))-firm(i,1)-sum(drop(drop(:,3)==ID(i,2),1));
        aggr(i,2) = sum(sec2(sec2(:,1)==ID(i,2),2))-firm(i,2)-sum(drop(drop(:,3)==ID(i,2),2));
    end
    %---------------------------------
    % load RND adjacency matrix
    if rndlink == 1
        file = [parentpath '/' dataset '/adjacency_matrix_rand_' num2str(lnkyear) '_years/' int2str(yr) '_adjacency_matrix.mat'];
    else
        file = [parentpath '/' dataset '/adjacency_matrix_' num2str(lnkyear) '_years/' int2str(yr) '_adjacency_matrix.mat'];
    end
    load(file)
    
    A = A(ID(:,1),:);
    A = A(:,ID(:,1));
    
    A = A(idx1,:);
    A = A(:,idx1);
    %---------------------------------
    % indicate firms with missing observations
    dm{s} = ones(n,1);
    for i = 1:n
        if (min(firm(i,:))<=0)||(min(aggr(i,:))<0)
            dm{s}(i) = 0;
        end
    end
    dm{s} = dm{s}(idx1);
    %---------------------------------
    % define variables
    X1{s} = firm(idx1,1:2);
    X2{s} = A*firm(idx1,1:2);
    X3{s} = aggr(idx1,1:2);
end
ID = ID(idx1,:);
n  = m;
%%
% drop the first t0 year(s)
dt = cat(1,dm{:});
dt = dt(n*t0+1:end);
T1 = T-t0;
%---------------------------------
% drop firms that appear less than twice
di = (1:n*T1)';
dj = kron(ones(T1,1),(1:n)');
Dn = sparse(di,dj,dt);
dd = sum(Dn,1);
Dn = Dn(:,dd>1); % firms appearing more than once in the panel
np = size(Dn,2);

E  = speye(n);
E  = E(dd>1,:); % firms appearing more than once in the panel
ID = E*ID;
for s = 1:T
    dm{s} = E*dm{s};
    X1{s} = E*X1{s};
    X2{s} = E*X2{s};
    X3{s} = E*X3{s};
end
%%
di  = cell(T1,1);
dj1 = cell(T1,1);
dj2 = cell(T1,1);
dj3 = cell(T1,1);
n1 = 0;
n2 = 0;
for s = 1:T1
    D = Dn(n*(s-1)+1:n*s,:);
    D = D(sum(D,2)==1,:); % firms without missing observations
    [ni,nj] = size(D); % note: ni is the number of nonzero entries of D
    [ii,jj,~] = find(D);
    di{s}  = n1+ii;
    dj1{s} = jj;
    dj2{s} = s*ones(ni,1);
    dj3{s} = n2+jj;
    
    n1 = n1+ni;
    n2 = n2+nj;
end
di  = cat(1,di{:});
dj1 = cat(1,dj1{:});
dj2 = cat(1,dj2{:});
dj3 = cat(1,dj3{:});

D1 = sparse(di,dj1,ones(n1,1),n1,np);
D2 = sparse(di,dj2,ones(n1,1),n1,T1); % block diagonal matrix of D*1
D3 = sparse(di,dj3,ones(n1,1),n1,n2); % block diagonal matrix of D

DD1 = (D1'*D1)\D1';
DD1 = speye(n1)-D1*DD1;

D4  = DD1*D2;
DD4 = (D4'*D4)\D4';
DD4 = D4*DD4;
PD3 = (DD1-DD4)*D3;
%%
Yn = cell(T1,1);
Zn = cell(T1,1);
Qn = cell(T1,1);
for s = (t0+1):T
    Yn{s-t0} = X1{s}(:,1);
    Zn{s-t0} = [X2{s}(:,1),X3{s}(:,1),X1{s}(:,2)];
    Qn{s-t0} = [X2{s}(:,2),X3{s}(:,2),X1{s}(:,2)];
end
Yn = cat(1,Yn{:});
Zn = cat(1,Zn{:});
Qn = cat(1,Qn{:});

nk = size(Zn,2);
%---------------------------------
% 2SLS estimation of model 3
Y3 = PD3*Yn;
Z3 = PD3*Zn;
Q3 = PD3*Qn;
QQ = Q3'*Q3;
QZ = Q3'*Z3;
QY = Q3'*Y3;
PI = QQ\QZ;
ZZ = (QZ'*PI)\PI';
b3(:,j) = ZZ*QY;
nfirm(j) = np;
end
sampling_out = ['sampling_output\sampling_output_' int2str(sampler*10) '.mat'];
save(sampling_out,'b3','nfirm');
%% print results
fid = fopen(['sampling_output\sampling_output_' int2str(sampler*10) '.txt'],'a');
fprintf(fid,'%s \n',date);
fprintf(fid,'Data: %s \n',dataset);
fprintf(fid,'alliance duration = %i \n',lnkyear);
fprintf(fid,'SIC = %i \n',sicdgit);
fprintf(fid,'start year is %i\n',yr1);
fprintf(fid,'end year is %i\n',yr2);
fprintf(fid,'t0 is %i\n',t0);
fprintf(fid,'RND stock time lag is %i\n',timelag);
fprintf(fid,'# of replications is %i \n',rep);
fprintf(fid,'sampling rate is %3.2f \n',sampler);
fprintf(fid,'avg # of firms is %7.2f \n',mean(nfirm));

fprintf(fid,'*******************\n');
fprintf(fid,'with firm and time fixed effects \n');
for ip = 1:nk
    fprintf(fid,' %7.4f\n (%6.4f)\n',mean(b3(ip,:)),std(b3(ip,:)));
end
fprintf(fid,'*******************\n\n');
fclose(fid);